
## Read in well header data and identify wells.
# Outputs into working.data.dir
# 'dt_classified_all_'%&%today()%&%'.RData'...The full R Workspace, except the dt.panel (well-by-month productionpanel)
# 'merged_dt_panel_'%&%today()%&%'.RData'...The dt.panel
dt = fread(di.data.folder%&%'/HPDIHeader.csv')
# dt = fread(di.data.folder%&%'/HPDIHeader.csv', nrows=1000)
setnames(dt, tolower(names(dt)))
dim(dt)

# dt.small = fread(di.data.folder%&%'/HPDIHeader.csv', nrows=5) 
# setnames(dt.small, tolower(names(dt.small)))
# grep('county', names(dt.small), v=T)

dt[, first_prod_date := as.Date(first_prod_date, format='%m-%d-%Y')]
dt[, last_prod_date := as.Date(last_prod_date, format='%m-%d-%Y')]
dt[, comp_date := as.Date(comp_date, format='%m-%d-%Y')]
dt[, spud_date := as.Date(spud_date, format='%m-%d-%Y')]

dt = dt[order(-first_prod_date)]

dt[,.N] # 1 million

#### Data cleaning. -----

# Nearly all (92 percent) of wells are clearly indicated as oil or gas wells in the raw data
dt[,.N/dt[,.N], by=prod_type][order(V1)][prod_type %in% c('GAS','OIL','CBM','OIL (CYCLIC STEAM)'), sum(V1)]
## Label wells as oil or gas
dt[prod_type=='OIL (CYCLIC STEAM)', prod_type := 'OIL']
dt[!(prod_type %in% c('OIL','GAS','CBM')), prod_type := 'Other']
dt[prod_type=='CBM', prod_type:='GAS']
dt[prod_type=='OIL', prod_type:='Oil']
dt[prod_type=='GAS', prod_type:='Gas']

dt[, mean(prod_type=='Other')] # 8% are Other (these were mostly O&G--5% of the 7.5%). 
# Classify these based on gor_cum

# Other wells look more like oil. Assign to gas unless the GOR is > the 90th percentile for oil wells.
oil.gor.p90 = dt[prod_type=='Oil', quantile(gor_cum, probs=0.9)]
oil.gor.p90

dt[prod_type=='Other', mean(gor_cum>oil.gor.p90)] # 24% of Other wells have GORs above Oil's p90. 
dt[prod_type=='Other' & gor_cum>oil.gor.p90, prod_type:='Gas'] # Classify these as Gas
dt[prod_type=='Other' & gor_cum>0, prod_type:='Oil'] # For the rest, label them as oil

# The obvious exception is ND wells, which are low gas and high oil
# Looking at this state-x-prod_type table, we see that 
# ND is the only state with a large number of "Other" wells
# that also has such lopsided skew towards oil.
# NY/CA are also a bit lopsided, but production from the "Other" wells
# in those states is skewed towards gas. Therefore ND needs to be labelled
# oil, but the rest can be labelled gas. This only affects ~500 wells of the 1 million
dt[, table(state, prod_type)]
dt[prod_type=='Other' & state=='ND', mean(log(liq_cum+1))]
dt[prod_type=='Other' & state=='ND', mean(log(gas_cum+1))]
dt[prod_type=='Other' & state=='NY', mean(log(liq_cum+1))]
dt[prod_type=='Other' & state=='NY', mean(log(gas_cum+1))]
dt[prod_type=='Other' & state=='CA', mean(log(liq_cum+1))]
dt[prod_type=='Other' & state=='CA', mean(log(gas_cum+1))]

# Classify the ND Other wells as oil
dt[prod_type=='Other' & state=='ND', prod_type:='Oil']
# Classify the remaining ones as gas
dt[prod_type=='Other', prod_type:='Gas']

# Final result: 494k oil wells, 550k gas wells (~1.05M total)
dt[, .N, by=prod_type]
dt[gor_cum==0, .N, by=prod_type]

### drop extra columns to economize on memory
names(dt)
dt[, .N, by=drill_type]
dt[, .N, by=prod_type]
keep.vars.regex = paste0('entity|(spud|comp|prod)_date|state|basin|total_depth|vert_depth|',
                         'prac_ip|api_no|well_id|latitude|longitude|_perf|gor_cum|',
                         'type|(gas|liq)*.cum|first12|eur|gor_cum|months_prod|',
                         'peak_(liq|gas)_|decl_12mo_(liq|gas)|sum12_(liq|gas)|county|gas_gath')
keep.vars = grep(keep.vars.regex, names(dt), v=T)

dt = dt[, ..keep.vars]
gc()

### Federal Lands Designation: Overlay well lat/lon variables with federal lands shape file -----
# locs = dt.map[, .(latitude, longitude)]
locs = dt[, .(longitude, latitude)]

# Pull in shape files
library(tidyverse)
library(sp)
library(dplyr)
library(sf)
library(rnaturalearth)

# Read in federal lands shape file
# We will use the "over" function to find which wells fall into the federal lands
# shape file
vessel = st_read(data.dir%&%'/GIS/fedlandp.gdb')
vessel = as(vessel,"Spatial") # convert to spatial point format

locs
coordinates(locs) <- c('longitude','latitude')
class(locs)
class(vessel)

proj4string(locs) <- proj4string(vessel)

# Read in oceans shape file to find offshore wells that are on state land (e.g., Alaska)
ocean = st_read(data.dir%&%'/GIS/ne_10m_ocean/ne_10m_ocean.shp')
ocean = as(ocean,"Spatial") # convert to spatial point format
proj4string(vessel); proj4string(ocean) # note: these have to be the same datum, or it won't work

# Test a location in Teddy Roosevelt Natl Park (should be in vessel)
loc.temp = locs[1]
loc.temp@coords[1] = -103.545638
loc.temp@coords[2] = 46.970701
over(loc.temp, vessel) # shows up. good.

# Test a location not in a park 
loc.temp = locs[1]
loc.temp@coords[1] = -77.037574
loc.temp@coords[2] = 38.909581
over(loc.temp, vessel) # doesn't show up. good

# Test a BLM location
loc.temp = locs[1]
loc.temp@coords[1] = -103.143009
loc.temp@coords[2] = 32.046985
over(loc.temp, vessel) # shows up as BLM. good

# Test an area NEAR a BLM location, but outside it
loc.temp = locs[1]
loc.temp@coords[1] = -103.095739
loc.temp@coords[2] = 32.032213
over(loc.temp, vessel) # doesn't show up. good

# Full dataset. locs is too big to do it. Need to do it in groups.
# Warning: this loop can take 2-4 hours to classify all 1 million wells.
N = dt[,.N]
G = 100 # split data into 100 groups
# G=5
N%%G # not quite evenly divisible by G. will have a few leftover, meaning other groups are short 1
N.per.G = ceiling(N/G)
N.per.G
grp.idx = split(1:N, f=rep(1:G, each=N.per.G))
length(unlist(grp.idx)); N
sapply(grp.idx, length)
st.time = Sys.time()
st.time
for (i in 1:G) {
  Sys.time()
  idx = grp.idx[[i]]
  
  over.fed.out = over(locs[idx], vessel)
  over.fed.out = as.data.table(over.fed.out)
  over.fed.out = cbind(as.data.table(locs[idx]), over.fed.out)
  
  over.ocean.out = over(locs[idx], ocean)
  over.ocean.out = as.data.table(over.ocean.out)
  setnames(over.ocean.out, old='featurecla', new='ocean')
  
  over.out.temp = cbind(as.data.table(locs[idx]), 
                        over.fed.out,
                        over.ocean.out[, .(ocean)])
  Sys.time()
  if (i==1) { 
    over.out = over.out.temp
  } else {
    over.out = rbind(over.out, over.out.temp)
  }
  rm(over.out.temp)
  message(i%&%' of '%&%G%&%' done')
  message(Sys.time())
}
ed.time = Sys.time()
difftime(ed.time, st.time)
over.out = over.out[,-c(1:2)] # drop duplicated lat/lon
setcolorder(over.out, c('latitude','longitude'))
setnames(over.out, tolower(names(over.out)))
na.omit(over.out)
rm(locs)
rm(ocean)

setnames(over.out, old='state', new='state.shape')
over.out.keep = grep('lat|long|feat|name|admin|ocean|state.shape', names(over.out), v=T)

dt = cbind(dt, over.out[, ..over.out.keep])
dt = dt[, unique(names(dt)), with=F] # drop duplicate lat/lon variables
rm(over.out)

# Use results to classify wells as on federal land.
dt[, fed.land := 1*!is.na(admin1)]
dt[state %in% c("FO GULF", "FO PACIFIC"), fed.land := 1] # fed lands shapefile doesn't tag Gulf as federal. Fix this
dt[, on.water := 0]
dt[ocean=='Ocean', on.water := 1]
dt[, sum(on.water), by=state]
dt[, offshore:= 1*(state %in% c("FO GULF", "FO PACIFIC") | on.water==1)]

# Alaska's offshore oil wells in Cook Inlet are federal, but not all AK offshore is federal:
# Prudhoe Bay offshore are state (Artic Slope, Arctic Ocean).
# https://www.blm.gov/programs/energy-and-minerals/oil-and-gas/about/alaska
# https://en.wikipedia.org/wiki/Prudhoe_Bay_Oil_Field
dt[state=='AK', table(fed.land, offshore, basin)]
dt[state=='AK' & offshore==1 & basin=='COOK INLET BASIN', fed.land:=1]

### Calculate monthly spud counts, by well type ----
dt[, spud_month := spud_date - day(spud_date) + 1]
dt[, spud_year := year(spud_date)]

dt[, first_prod_month := first_prod_date - day(first_prod_date) + 1]
dt[, first_prod_date := NULL] # it's always listed at day=1
dt[, first_prod_year := year(first_prod_month)]

elapsed_months <- function(end_date, start_date) {
  ed <- as.POSIXlt(end_date)
  sd <- as.POSIXlt(start_date)
  12 * (ed$year - sd$year) + (ed$mon - sd$mon)
}

dt[, spud_to_prod := elapsed_months(first_prod_month, spud_month)]

save('dt', file=working.data.dir%&%'/dt_classified_'%&%today()%&%'.RData')
save.image(file=working.data.dir%&%'/dt_classified_all_'%&%today()%&%'.RData')

## Bring in panel data to plot production over time ----
dt.panel = fread(di.data.folder%&%'/HPDIProduction.csv')
setnames(dt.panel, tolower(names(dt.panel)))
gc()

dt.panel[,.N]
dt.panel[,uniqueN(entity_id)]
dt[,.N]
dt.panel = merge(dt[,.(entity_id, first_prod_month, prod_type, drill_type, fed.land, offshore, state, county)], dt.panel, by='entity_id', all.y=FALSE)
dt.panel[,uniqueN(entity_id)]
dt.panel[,.N]
gc()

save('dt.panel', file=working.data.dir%&%'merged_dt_panel_'%&%today()%&%'.RData')
merged_panel_data_loc = working.data.dir%&%'merged_dt_panel_'%&%today()%&%'.RData'
